1 Investigating MLL samples

2 Transcriptional profile PCA and differential gene expression (DGA)

2.1 Sorting

Isolated cells were sorted using a variety of cell markers

2.2 RNA-seq

RNA was isolated and sequenced from the various cell types.

2.3 Sample summary

sample_table <- read.delim('sample.table', stringsAsFactors = FALSE)

rownames(sample_table) <- sample_table$Nextera_XT_index

sample_table$Nextera_XT_index <- NULL
sample_table$Replicates. <- NULL
sample_table$deseq_groups <- c("1","6","3","2","4","1","3","2","5","1","5","2","5","1","3","6","2","6","3")
datatable(sample_table) %>% 
  formatStyle("Phenotype", backgroundColor = styleEqual(c("CD33+","CD33+GFP-","CD33+GFP+","CD19+","CD19+GFP+"), c("#EAD3BF","#AA9486", "#B6854D","#9986A5","#79402E")))

2.4 PCA

2.4.1 PCA working with FPKMS

# fpkm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("FPKM"))
# 
# colnames(fpkm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(fpkm_data,"fpkm_data.rds")
fpkm_data <- readRDS("fpkm_data.rds")
## TPM data
# tpm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("TPM"))
# 
# colnames(tpm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(tpm_data , "tpm_data.rds")

tpm_data <- readRDS("tpm_data.rds")

## Filter genes with a low expression throughout
# - select rows with max value >= 2
fpkm_data <- fpkm_data[apply(fpkm_data[,3:21], 1, max) >= 2,]

fpkms_counts_only <- fpkm_data[,3:21]
## Log transform the data
log.data = log2(fpkms_counts_only+1)

colnames(log.data) <- colnames(fpkm_data[,3:21])
rownames(log.data) <- make.names(fpkm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)

fit <- prcomp(t_log.data)

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
   ggtitle(label = "FPKM PCA; no quantile normalization")

2.4.2 PCA and quantile normalization

## Quantile normalization
normalized.data = normalize.quantiles(as.matrix(log.data)) # A quantile normalization.

## Transpose the matrix back for PCA
normalized.data = t(normalized.data)

## re-attach rownames colnames
rownames(normalized.data) <- colnames(log.data)
colnames(normalized.data) <- rownames(log.data)

fit <- prcomp(normalized.data)

x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
  ggtitle(label = "FPKM PCA; with quantile normalization")

2.4.3 TPMs without normalization

## Filter genes with a low expression throughout
# - select rows with max value >= 2
tpm_data <- tpm_data[apply(tpm_data[,3:21], 1, max) >= 2,]

tpm_counts_only <- tpm_data[,3:21]
## Log transform the data
log.data = log2(tpm_counts_only+1)

colnames(log.data) <- colnames(tpm_data[,3:21])
rownames(log.data) <- make.names(tpm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)

fit <- prcomp(t_log.data)

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
   ggtitle(label = "TPM PCA; no quantile normalization")

##PCA with counts ###Using scater normalization

# big_counts <- paste0("myriad/",rownames(sample_table),"/replicate_1/qc/",rownames(sample_table), "_QC.summary.txt/QC.geneCounts.formatted.for.DESeq.txt.gz") %>% map(gzfile) %>% map(read.table, stringsAsFactors = FALSE, sep = "\t")  %>% reduce(left_join, by = "V1") 
# 
# colnames(big_counts) <- c("ensgene", rownames(sample_table))
# rownames(big_counts) <- big_counts$ensgene
# big_counts$ensgene <- NULL
# big_counts <- big_counts[!is.na(rowSums(big_counts)),]
# saveRDS(big_counts,"big_counts.rds")
big_counts <- readRDS("big_counts.rds")
count_matrix <- as.matrix(big_counts)
colnames(count_matrix) <- rownames(sample_table)
rownames(count_matrix) <- rownames(big_counts)

sce <- SingleCellExperiment(list(counts = count_matrix),
                            colData = sample_table)
sce <- sce[rowSums(counts(sce)) >  0,]
sce <- calculateQCMetrics(sce)
## prepare total count and total features data

my_df <- data.frame("sample_id" = colnames(sce), "total_counts" = sce$total_counts,
                    "total_features" = sce$total_features_by_counts )
ggplot(my_df,aes(total_counts)) + geom_histogram(bins = 10) + 
  theme_minimal() + ggtitle("Histogram of total counts") + ylab("number of samples") + xlab("total counts")

ggplot(my_df,aes(total_features)) + geom_histogram(bins = 10) + 
  theme_minimal() + ggtitle("Histogram of total features") + ylab("number of samples") + xlab("total features")

###PCA without batch effect removal

sizeFactors(sce) <- librarySizeFactors(sce)
sce <- normalize(sce)

sce <- runPCA(sce)

data <- reducedDim(sce)
percent_var_PC1<- paste("PC1", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[1] * 100 ))
percent_var_PC2<- paste("PC2", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[2] * 100 ))

data <- data.frame(data)
data<-left_join(rownames_to_column(data), rownames_to_column(sample_table))

ggplot(data,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3) + 
  geom_label_repel(label = data$Samples, size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(percent_var_PC1) + ylab(percent_var_PC2)

2.5 DEseq2

big_counts <- big_counts[-c(58736:58739),]

dds <- DESeqDataSetFromMatrix(big_counts, colData = sample_table, design = ~deseq_groups)
my_vst <- vst(dds)

pcaData <- DESeq2::plotPCA(my_vst, intgroup =c("Phenotype","Samples"),returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))


ggplot(pcaData,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3 ) +
        xlab(paste0("PC1: ",percentVar[1],"% variance")) +
        ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
          geom_label_repel(aes(label = pcaData$Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') +
        theme_minimal()

2.5.1 Genes driving PC1 and PC2

top_contribs = function(object) {
  # calculate the variance for each gene
  rv <- rowVars(assay(object))

  # select the 1000 top genes by variance
  select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))]

  # perform a PCA on the data in assay(x) for the selected genes
  pca <- prcomp(t(assay(object)[select,]))

  # the contribution to the total variance for each component
  percentVar <- pca$sdev^2 / sum( pca$sdev^2 )

  # Top 20 contributers to PC1 PC2
  PCA1_contrib <- sort(abs(pca$rotation[,1]), decreasing = TRUE )[1:20]
  PCA2_contrib <- sort(abs(pca$rotation[,2]), decreasing = TRUE )[1:20]
  PCA_contrib <- c(PCA1_contrib, PCA2_contrib)
  PCA_contrib <- data.frame("ensgene" = names(PCA_contrib), PCA_contrib)



  PCA_contrib <- cbind(PCA_contrib, data.frame("PC" = c(rep("PC1",20),rep("PC2","20"))))
  PCA_contrib <- left_join(PCA_contrib, grch38, by = "ensgene") %>% dplyr::select("PC","symbol", "biotype")
  return(PCA_contrib)
}

datatable(top_contribs(my_vst))

2.6 Differential Gene Expression

2.6.1 CD33+ vs CD33+GFP+

dds$deseq_groups <- relevel(dds$deseq_groups,ref = "2")

deseq <- DESeq(dds)

res <- results(deseq,contrast = c("deseq_groups", "1", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj)

res_for_table <- resOrdered  %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))

2.6.2 GSEA CD33+ vs CD33+GFP+

hallmark <- gmtPathways("~/genome_apps/GSEA/h.all.v6.2.symbols.gmt")

plot_gsea <- function(res){
res <- res[complete.cases(res),]
res$fcSign <- sign(res$log2FoldChange)
res$logP <- -log10(res$padj)
res$metric <- res$logP/res$fcSign
y<-res[,c("symbol", "metric")]
geneList <- y$metric
names(geneList) <- y$symbol
geneList <- geneList[order(geneList, decreasing = T)]
fgseaRes <- fgsea(hallmark, geneList, nperm=10000, maxSize = 500)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.05,]
plotGseaTable(hallmark[fgseaRes$pathway], geneList, fgseaRes, gseaParam = 0.4)
}

plot_gsea(resOrdered)

2.6.3 DGE CD33+GFP- vs CD33+GFP+

res <- results(deseq,contrast = c("deseq_groups", "3", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj) 

res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))

2.6.4 GSEA CD33+GFP- vs CD33+GFP+

plot_gsea(resOrdered)

2.6.5 DGE CD19+ vs CD19+GFP+

dds$deseq_groups <- relevel(dds$deseq_groups,ref="4")
res <- results(deseq,contrast = c("deseq_groups", "5", "4"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj) 

res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))

2.6.6 GSEA CD19+ vs CD19+GFP+

plot_gsea(resOrdered)

---
title: "Exploring MLL Samples"
author: <a href="https://chela-james.github.io/"> <h3> Chela James </h3> </a> \newline Cancer Institute, UCL, UK
date: 15 Aprl 2019
output:
  bookdown::html_document2:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    theme: united
    highlight: tango
    df_print: paged
    code_folding: hide
    code_download: true
    self_contained: true
    keep_md: false
    encoding: "UTF-8"
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
htmltools::tagList(rmarkdown::html_dependency_font_awesome())
knitr::opts_chunk$set(eval = TRUE, cache = FALSE, warning = FALSE, message = FALSE)
```


```{r,  out.width = "100%", echo = FALSE, warning = FALSE}
library(knitr)
library(ggplot2)
library(sva)
library(dplyr)
library(preprocessCore)
library(ggrepel)
library(Biobase)
library(tibble)
library(scater)
library(gghighlight)
library(annotables)
library(DESeq2)
library(fgsea)
library(DT)
library(purrr)
```

# Investigating MLL samples

# Transcriptional profile PCA and differential gene expression (DGA)

## Sorting

Isolated cells were sorted using a variety of cell markers

## RNA-seq

RNA was isolated and sequenced from the various cell types.

## Sample summary

```{r}
sample_table <- read.delim('sample.table', stringsAsFactors = FALSE)

rownames(sample_table) <- sample_table$Nextera_XT_index

sample_table$Nextera_XT_index <- NULL
sample_table$Replicates. <- NULL
sample_table$deseq_groups <- c("1","6","3","2","4","1","3","2","5","1","5","2","5","1","3","6","2","6","3")
datatable(sample_table) %>% 
  formatStyle("Phenotype", backgroundColor = styleEqual(c("CD33+","CD33+GFP-","CD33+GFP+","CD19+","CD19+GFP+"), c("#EAD3BF","#AA9486", "#B6854D","#9986A5","#79402E")))
```

## PCA 

### PCA working with FPKMS 

```{r}
# fpkm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("FPKM"))
# 
# colnames(fpkm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(fpkm_data,"fpkm_data.rds")
fpkm_data <- readRDS("fpkm_data.rds")
## TPM data
# tpm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("TPM"))
# 
# colnames(tpm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(tpm_data , "tpm_data.rds")

tpm_data <- readRDS("tpm_data.rds")

## Filter genes with a low expression throughout
# - select rows with max value >= 2
fpkm_data <- fpkm_data[apply(fpkm_data[,3:21], 1, max) >= 2,]

fpkms_counts_only <- fpkm_data[,3:21]
## Log transform the data
log.data = log2(fpkms_counts_only+1)

colnames(log.data) <- colnames(fpkm_data[,3:21])
rownames(log.data) <- make.names(fpkm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)

fit <- prcomp(t_log.data)

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
   ggtitle(label = "FPKM PCA; no quantile normalization")


```


### PCA and [quantile normalization](http://genomicsclass.github.io/book/pages/normalization.html)

```{r}

## Quantile normalization
normalized.data = normalize.quantiles(as.matrix(log.data)) # A quantile normalization.

## Transpose the matrix back for PCA
normalized.data = t(normalized.data)

## re-attach rownames colnames
rownames(normalized.data) <- colnames(log.data)
colnames(normalized.data) <- rownames(log.data)

fit <- prcomp(normalized.data)

x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
  ggtitle(label = "FPKM PCA; with quantile normalization")

```

### TPMs without normalization

```{r}
## Filter genes with a low expression throughout
# - select rows with max value >= 2
tpm_data <- tpm_data[apply(tpm_data[,3:21], 1, max) >= 2,]

tpm_counts_only <- tpm_data[,3:21]
## Log transform the data
log.data = log2(tpm_counts_only+1)

colnames(log.data) <- colnames(tpm_data[,3:21])
rownames(log.data) <- make.names(tpm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)

fit <- prcomp(t_log.data)

x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))

ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) + 
  geom_label_repel(aes(label = Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
   ggtitle(label = "TPM PCA; no quantile normalization")
```

##PCA with counts
###Using [scater](https://bioconductor.org/packages/release/bioc/html/scater.html) normalization 

```{r}
# big_counts <- paste0("myriad/",rownames(sample_table),"/replicate_1/qc/",rownames(sample_table), "_QC.summary.txt/QC.geneCounts.formatted.for.DESeq.txt.gz") %>% map(gzfile) %>% map(read.table, stringsAsFactors = FALSE, sep = "\t")  %>% reduce(left_join, by = "V1") 
# 
# colnames(big_counts) <- c("ensgene", rownames(sample_table))
# rownames(big_counts) <- big_counts$ensgene
# big_counts$ensgene <- NULL
# big_counts <- big_counts[!is.na(rowSums(big_counts)),]
# saveRDS(big_counts,"big_counts.rds")
big_counts <- readRDS("big_counts.rds")
count_matrix <- as.matrix(big_counts)
colnames(count_matrix) <- rownames(sample_table)
rownames(count_matrix) <- rownames(big_counts)

sce <- SingleCellExperiment(list(counts = count_matrix),
                            colData = sample_table)
sce <- sce[rowSums(counts(sce)) >  0,]
sce <- calculateQCMetrics(sce)

```

```{r}
## prepare total count and total features data

my_df <- data.frame("sample_id" = colnames(sce), "total_counts" = sce$total_counts,
                    "total_features" = sce$total_features_by_counts )
ggplot(my_df,aes(total_counts)) + geom_histogram(bins = 10) + 
  theme_minimal() + ggtitle("Histogram of total counts") + ylab("number of samples") + xlab("total counts")
```

```{r}
ggplot(my_df,aes(total_features)) + geom_histogram(bins = 10) + 
  theme_minimal() + ggtitle("Histogram of total features") + ylab("number of samples") + xlab("total features")

```

###PCA without batch effect removal

```{r}
sizeFactors(sce) <- librarySizeFactors(sce)
sce <- normalize(sce)

sce <- runPCA(sce)

data <- reducedDim(sce)
percent_var_PC1<- paste("PC1", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[1] * 100 ))
percent_var_PC2<- paste("PC2", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[2] * 100 ))

data <- data.frame(data)
data<-left_join(rownames_to_column(data), rownames_to_column(sample_table))

ggplot(data,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3) + 
  geom_label_repel(label = data$Samples, size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') + theme_minimal() + xlab(percent_var_PC1) + ylab(percent_var_PC2)

```


## DEseq2

```{r}
big_counts <- big_counts[-c(58736:58739),]

dds <- DESeqDataSetFromMatrix(big_counts, colData = sample_table, design = ~deseq_groups)
my_vst <- vst(dds)

pcaData <- DESeq2::plotPCA(my_vst, intgroup =c("Phenotype","Samples"),returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))


ggplot(pcaData,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3 ) +
        xlab(paste0("PC1: ",percentVar[1],"% variance")) +
        ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
          geom_label_repel(aes(label = pcaData$Samples), size = 2,box.padding   = 0.1,point.padding = 0.1,
                   segment.color = 'grey50') +
        theme_minimal()

```

### Genes driving PC1 and PC2

```{r}

top_contribs = function(object) {
  # calculate the variance for each gene
  rv <- rowVars(assay(object))

  # select the 1000 top genes by variance
  select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))]

  # perform a PCA on the data in assay(x) for the selected genes
  pca <- prcomp(t(assay(object)[select,]))

  # the contribution to the total variance for each component
  percentVar <- pca$sdev^2 / sum( pca$sdev^2 )

  # Top 20 contributers to PC1 PC2
  PCA1_contrib <- sort(abs(pca$rotation[,1]), decreasing = TRUE )[1:20]
  PCA2_contrib <- sort(abs(pca$rotation[,2]), decreasing = TRUE )[1:20]
  PCA_contrib <- c(PCA1_contrib, PCA2_contrib)
  PCA_contrib <- data.frame("ensgene" = names(PCA_contrib), PCA_contrib)



  PCA_contrib <- cbind(PCA_contrib, data.frame("PC" = c(rep("PC1",20),rep("PC2","20"))))
  PCA_contrib <- left_join(PCA_contrib, grch38, by = "ensgene") %>% dplyr::select("PC","symbol", "biotype")
  return(PCA_contrib)
}

datatable(top_contribs(my_vst))

```

## Differential Gene Expression

### CD33^+^ vs CD33^+^GFP^+^

```{r}
dds$deseq_groups <- relevel(dds$deseq_groups,ref = "2")

deseq <- DESeq(dds)

res <- results(deseq,contrast = c("deseq_groups", "1", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj)

res_for_table <- resOrdered  %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))

```


### GSEA CD33^+^ vs CD33^+^GFP^+^

```{r}

hallmark <- gmtPathways("~/genome_apps/GSEA/h.all.v6.2.symbols.gmt")

plot_gsea <- function(res){
res <- res[complete.cases(res),]
res$fcSign <- sign(res$log2FoldChange)
res$logP <- -log10(res$padj)
res$metric <- res$logP/res$fcSign
y<-res[,c("symbol", "metric")]
geneList <- y$metric
names(geneList) <- y$symbol
geneList <- geneList[order(geneList, decreasing = T)]
fgseaRes <- fgsea(hallmark, geneList, nperm=10000, maxSize = 500)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.05,]
plotGseaTable(hallmark[fgseaRes$pathway], geneList, fgseaRes, gseaParam = 0.4)
}

plot_gsea(resOrdered)
```


### DGE CD33^+^GFP^-^ vs CD33^+^GFP^+^

```{r}
res <- results(deseq,contrast = c("deseq_groups", "3", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj) 

res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))


```

### GSEA CD33^+^GFP^-^ vs CD33^+^GFP^+^

```{r}
plot_gsea(resOrdered)
```



### DGE CD19^+^ vs CD19^+^GFP^+^

```{r}
dds$deseq_groups <- relevel(dds$deseq_groups,ref="4")
res <- results(deseq,contrast = c("deseq_groups", "5", "4"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj) 

res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))


```

### GSEA CD19^+^ vs CD19^+^GFP^+^

```{r}
plot_gsea(resOrdered)
```

